## Import Modules
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn.metrics import completeness_score, homogeneity_score
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import math
from sklearn import feature_selection
import graphviz
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn import cross_validation  # NOTE(review): removed in sklearn >= 0.20; sklearn.model_selection is the replacement
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree
import seaborn as sns
from sklearn import preprocessing
import numpy as np
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
# %matplotlib inline  -- IPython magic; invalid syntax outside a notebook, so kept as a comment
#from sklearn import tree, naive_bayes
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
##get working directory
os.getcwd()
# NOTE(review): hard-coded absolute Windows path; adjust for your machine
os.chdir('C:\\Depaul_Win7\\DSC 478 Machine Learning\\Final Project\\health_adult')
##load data via read_csv (NHIS Sample Adult file)
health_df = pd.read_csv('samadult.csv', sep=",")
health_df.head()
#check types
health_df.dtypes
columnNames = list(health_df.head(0))
print(columnNames)
#create reduced dataframe with the survey variables of interest
# .copy() so the recodes below modify an independent frame, not a view of health_df
# (avoids pandas SettingWithCopyWarning / silent no-op assignments)
health_red = health_df[['SEX', 'R_MARITL', 'MRACRPI2', 'REGION', 'PAR_STAT', 'DOINGLWA', 'SUPERVIS', 'WRKCATA', 'HOURPDA', 'PDSICKA', 'WRKLYR4', 'HYPEV', 'HYBPLEV', 'CHLEV', 'CHDEV', 'MIEV', 'STREV', 'COPDEV', 'AASMEV', 'ULCEV', 'CANEV', 'DBHVPAY', 'DBHVCLY', 'DBHVWLY', 'DBHVPAN', 'DBHVCLN', 'DBHVWLN', 'DIBREL', 'DIBEV1', 'DIBPRE2', 'EPILEP1', 'AHAYFYR', 'SINYR', 'CBRCHYR', 'KIDWKYR', 'LIVYR', 'JNTSYMP', 'ARTH1', 'PAINECK', 'PAINLB', 'PAINFACE', 'AMIGR', 'ACOLD2W', 'AINTIL2W', 'AHEARST1', 'AVISION', 'VIM_GLEV', 'VIM_MDEV', 'VIMGLASS', 'AVISACT', 'CHPAIN6M', 'AHSTATYR', 'FLA1AR', 'SPECEQ', 'ALC1YR', 'CIGAREV2', 'ECIGEV2', 'SMKSTAT2', 'APLKIND', 'AWORPAY', 'ADNLONG2', 'ASRGYR', 'AMDLONGR', 'HIT1A', 'HIT2A', 'HIT3A', 'HIT4A', 'FLUVACYR', 'LIVEV', 'ASICPUSE', 'ASIRETR', 'ASIMEDC', 'ASISTLV', 'ASICNHC', 'AWEBUSE', 'YTQU_YG1','ALCSTAT','YRSWRKPA', 'ASISLEEP', 'AHEIGHT', 'AWEIGHTP', 'BMI', 'BEDDAYR', 'CLCKTP', 'AGE_P', 'AHCNOYR2', 'LOCALL1B']].copy()
#recode race: collapse the detailed race codes (9,10,11,15,16,17) into category 4
race_map = {9: 4, 10: 4, 11: 4, 15: 4, 16: 4, 17: 4}
health_red['MRACRPI2'] = health_red['MRACRPI2'].replace(race_map)
health_red['MRACRPI2'].value_counts().plot(kind='bar')
health_red.columns.get_loc("R_MARITL")
#recode marital status into four groups:
# 1 = married, 2 = widowed, 3 = divorced/separated, 4 = never married/unknown
marital_map = {2: 1, 3: 1, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4, 9: 4}
health_red['R_MARITL'] = health_red['R_MARITL'].replace(marital_map)
health_red['R_MARITL'].value_counts().plot(kind='bar')
#recode "refused/not ascertained/don't know" codes (7,8,9) to NaN in the first 76 columns
health_nan = health_red.iloc[:,0:76].replace([7,8,9], np.nan)
#column at position 76 is carried over unchanged
health_nan2 = health_red.iloc[:,76]
# join_axes was removed in pandas 1.0; align on health_nan's index explicitly instead
health_nan3 = pd.concat([health_nan, health_nan2], axis=1).reindex(health_nan.index)
health_nan3.head()
health_nan3.shape
#recode the numeric-style missing codes in the rest of the dataframe to merge
health_rest = health_red.iloc[:,77:87].replace([96,97,98,99,996,997,998,999,9999], np.nan)
health_rest.head()
health_rest.shape
#merge dataframes with values 7,8,9,96,97,98,99,996,997,998,999,9999 recoded to NaN
mergedf = pd.concat([health_nan3, health_rest], axis=1).reindex(health_nan3.index)
mergedf.head()
mergedf['LOCALL1B'].shape
#test.isna().sum()
print(mergedf.isna().sum().to_string())
#drop every row that still contains a missing value
health_na = mergedf.dropna()
#check number of NAs
health_na.isnull().sum().sum()
health_na.shape
health_na['LOCALL1B'].shape
health_na['ASISLEEP'].value_counts().plot(kind='bar')
health_na['ASISLEEP'].hist()
health_na.describe()
#bar plots for the first six columns
for col in health_na.columns[:6]:
    health_na[col].value_counts().plot(kind='bar')
#plot all variables
# matplotlib.style.use('ggplot')
# health_na.plot.bar(subplots=True)
#save data frame without NAs to csv file
health_na.to_csv('health_na.csv', sep=",")
#create reduced categorical dataframe
health_cat = health_na[['SEX', 'R_MARITL', 'MRACRPI2', 'REGION', 'PAR_STAT', 'DOINGLWA', 'SUPERVIS', 'WRKCATA', 'HOURPDA', 'PDSICKA', 'WRKLYR4', 'HYPEV', 'HYBPLEV', 'CHLEV', 'CHDEV', 'MIEV', 'STREV', 'COPDEV', 'AASMEV', 'ULCEV', 'CANEV', 'DBHVPAY', 'DBHVCLY', 'DBHVWLY', 'DBHVPAN', 'DBHVCLN', 'DBHVWLN', 'DIBREL', 'DIBEV1', 'DIBPRE2', 'EPILEP1', 'AHAYFYR', 'SINYR', 'CBRCHYR', 'KIDWKYR', 'LIVYR', 'JNTSYMP', 'ARTH1', 'PAINECK', 'PAINLB', 'PAINFACE', 'AMIGR', 'ACOLD2W', 'AINTIL2W', 'AHEARST1', 'AVISION', 'VIM_GLEV', 'VIM_MDEV', 'VIMGLASS', 'AVISACT', 'CHPAIN6M', 'AHSTATYR', 'FLA1AR', 'SPECEQ', 'ALC1YR', 'CIGAREV2', 'ECIGEV2', 'SMKSTAT2', 'APLKIND', 'AWORPAY', 'ADNLONG2', 'ASRGYR', 'AMDLONGR', 'HIT1A', 'HIT2A', 'HIT3A', 'HIT4A', 'FLUVACYR', 'LIVEV', 'ASICPUSE', 'ASIRETR', 'ASIMEDC', 'ASISTLV', 'ASICNHC', 'AWEBUSE', 'YTQU_YG1', 'ALCSTAT']]
health_cat.head()
#check number of NAs
health_cat.isnull().sum().sum()
health_cat.shape
#create reduced numerical dataframe
health_num = health_na[['YRSWRKPA', 'ASISLEEP', 'AHEIGHT', 'AWEIGHTP', 'BMI', 'BEDDAYR', 'CLCKTP', 'AGE_P', 'AHCNOYR2', 'LOCALL1B']]
health_num.head()
health_num['LOCALL1B'].shape
#check number of NAs
health_num.isnull().sum().sum()
# draw each boxplot first, then call plt.show() with no arguments:
# plt.show() does not accept an Axes positional argument in current matplotlib
health_num.boxplot(column=['YRSWRKPA', 'ASISLEEP', 'AHEIGHT','AWEIGHTP'])
plt.show()
health_num.boxplot(column=['BMI'])
plt.show()
health_num.boxplot(column=['BEDDAYR', 'CLCKTP', 'AGE_P', 'AHCNOYR2', 'LOCALL1B'])
plt.show()
# # z-score standardization (kept for reference; min-max scaling is used instead)
# zscore = lambda x: ((x - x.mean()) / x.std()) if (x.dtypes==np.float64 or x.dtypes==np.int64) else x
# health_num_std = health_num.copy()
# health_num_std.apply(zscore).head()
#min max normalization of numerical variables
x = health_num.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
health_num_norm = pd.DataFrame(x_scaled, columns=health_num.columns)
health_num_norm.head()
health_num_norm['LOCALL1B'].shape
#check number of NAs
health_num_norm.isnull().sum()
# boxplot first, then a parameterless plt.show() (plt.show takes no data argument)
health_num_norm.boxplot(column=['YRSWRKPA', 'ASISLEEP', 'AHEIGHT', 'AWEIGHTP', 'BMI', 'BEDDAYR', 'CLCKTP', 'AGE_P', 'AHCNOYR2', 'LOCALL1B'])
plt.show()
#check for correlation between numerical variables
health_num_norm.corr()
pd.plotting.scatter_matrix(health_num_norm, figsize=(16, 16), alpha = 0.2)
plt.show()
plt.figure(figsize=(10,10))
sns.heatmap(health_num_norm.corr(), cmap='BuGn')
#average weight and average height are highly correlated,
#as are age and years on the job -- drop AWEIGHTP and AGE_P
health_num_norm_drop = health_num_norm.drop(columns=["AWEIGHTP"])
health_num_norm_drop['LOCALL1B'].shape
#check number of NAs
health_num_norm_drop.isnull().sum().sum()
#drop age
health_num_norm_drop = health_num_norm_drop.drop(columns=["AGE_P"])
#check number of NAs
health_num_norm_drop.isnull().sum().sum()
#Plot correlation of the remaining variables
sns.heatmap(health_num_norm_drop.corr(), cmap='BuGn')
#re-run correlations
health_num_norm_drop.corr()
#save health_num_norm_drop to csv file
health_num_norm_drop.to_csv('health_num_norm_drop.csv', sep=",")
health_num_norm_drop.shape
#force conversion to categorical variables
health_cat = health_cat.astype('category')
print(health_cat.dtypes.to_string())
health_cat['ALCSTAT'].shape
#one-hot encode, dropping the first level of each factor to avoid collinearity
health_dummy = pd.get_dummies(health_cat, drop_first=True)
health_dummy.reset_index(drop=True, inplace=True)
health_dummy.head()
health_dummy['ALCSTAT_10'].shape
#check number of NAs
health_dummy.isnull().sum().sum()
columnNamesDummy = health_dummy.columns.tolist()
print(columnNamesDummy)
#save dummy variables to csv file
health_dummy.to_csv('health_dummy.csv', sep=",")
health_dummy.shape
health_num.shape
health_num.isnull().sum().sum()
#reset index for health_num so it aligns with health_dummy's 0..n-1 index
health_num.reset_index(drop=True, inplace=True)
#merge categorical dummy with numerical non-normalized
# join_axes was removed in pandas 1.0; reindex on health_dummy's index instead
health_cleaned = pd.concat([health_dummy, health_num], axis=1).reindex(health_dummy.index)
health_cleaned.tail()
# #merge categorical dummy with numerical normalized (alternative, unused)
# health_cleaned = pd.concat([health_dummy, health_num_norm_drop], axis=1).reindex(health_dummy.index)
# health_cleaned.tail()
health_cleaned['LOCALL1B'].shape
health_cleaned['SEX_2'].shape
health_cleaned.shape
#save health cleaned to csv file
health_cleaned.to_csv('health_cleaned.csv', sep=",")
#Y ->'DBHVPAY_2.0' = Told to increase physical activity, past 12 m
#create dataframe
health_cl = health_cleaned[['DBHVPAY_2.0', 'SEX_2', 'R_MARITL_2', 'R_MARITL_3', 'R_MARITL_4', 'MRACRPI2_2', 'MRACRPI2_3', 'MRACRPI2_4', 'REGION_2', 'REGION_3', 'REGION_4', 'PAR_STAT_2', 'PAR_STAT_3', 'DOINGLWA_2.0', 'DOINGLWA_3.0', 'DOINGLWA_4.0', 'DOINGLWA_5.0', 'SUPERVIS_2.0', 'WRKCATA_2.0', 'WRKCATA_3.0', 'WRKCATA_4.0', 'WRKCATA_5.0', 'WRKCATA_6.0', 'HOURPDA_2.0', 'PDSICKA_2.0', 'WRKLYR4_1.0', 'WRKLYR4_2.0', 'HYPEV_2.0', 'HYBPLEV_2.0', 'HYBPLEV_3.0', 'HYBPLEV_4.0', 'HYBPLEV_5.0', 'CHLEV_2.0', 'CHDEV_2.0', 'MIEV_2.0', 'STREV_2.0', 'COPDEV_2.0', 'AASMEV_2.0', 'ULCEV_2.0', 'CANEV_2.0', 'DBHVCLY_2.0', 'DBHVWLY_2.0', 'DBHVPAN_2.0', 'DBHVCLN_2.0', 'DBHVWLN_2.0', 'DIBREL_2.0', 'DIBEV1_3.0', 'DIBPRE2_2.0', 'EPILEP1_2.0', 'AHAYFYR_2.0', 'SINYR_2.0', 'CBRCHYR_2.0', 'KIDWKYR_2.0', 'LIVYR_2.0', 'JNTSYMP_2.0', 'ARTH1_2.0', 'PAINECK_2.0', 'PAINLB_2.0', 'PAINFACE_2.0', 'AMIGR_2.0', 'ACOLD2W_2.0', 'AINTIL2W_2.0', 'AHEARST1_2.0', 'AHEARST1_3.0', 'AHEARST1_4.0', 'AHEARST1_5.0', 'AHEARST1_6.0', 'AVISION_2.0', 'VIM_GLEV_2.0', 'VIM_MDEV_2.0', 'VIMGLASS_2.0', 'AVISACT_2.0', 'CHPAIN6M_2.0', 'CHPAIN6M_3.0', 'CHPAIN6M_4.0', 'AHSTATYR_2.0', 'AHSTATYR_3.0', 'FLA1AR_2', 'FLA1AR_3', 'SPECEQ_2.0', 'ALC1YR_2.0', 'CIGAREV2_2.0', 'ECIGEV2_2.0', 'SMKSTAT2_2.0', 'SMKSTAT2_3.0', 'SMKSTAT2_4.0', 'APLKIND_2.0', 'APLKIND_3.0', 'APLKIND_4.0', 'APLKIND_5.0', 'APLKIND_6.0', 'AWORPAY_2.0', 'AWORPAY_3.0', 'ADNLONG2_1.0', 'ADNLONG2_2.0', 'ADNLONG2_3.0', 'ADNLONG2_4.0', 'ADNLONG2_5.0', 'ASRGYR_2.0', 'AMDLONGR_1.0', 'AMDLONGR_2.0', 'AMDLONGR_3.0', 'AMDLONGR_4.0', 'AMDLONGR_5.0', 'HIT1A_2.0', 'HIT2A_2.0', 'HIT3A_2.0', 'HIT4A_2.0', 'FLUVACYR_2.0', 'LIVEV_2.0', 'ASICPUSE_2.0', 'ASICPUSE_3.0', 'ASICPUSE_4.0', 'ASIRETR_2.0', 'ASIRETR_3.0', 'ASIRETR_4.0', 'ASIMEDC_2.0', 'ASIMEDC_3.0', 'ASIMEDC_4.0', 'ASISTLV_2.0', 'ASISTLV_3.0', 'ASISTLV_4.0', 'ASICNHC_2.0', 'ASICNHC_3.0', 'ASICNHC_4.0', 'AWEBUSE_2.0', 'YTQU_YG1_2.0', 'ALCSTAT_2', 'ALCSTAT_3', 'ALCSTAT_5', 'ALCSTAT_6', 'ALCSTAT_7', 'ALCSTAT_8', 'ALCSTAT_9', 
'ALCSTAT_10', 'YRSWRKPA', 'ASISLEEP', 'AHEIGHT', 'BMI', 'BEDDAYR', 'CLCKTP', 'AHCNOYR2', 'LOCALL1B']]
health_cl['DBHVPAY_2.0'].head()
#recode 'DBHVPAY_2.0' so 1 means Yes
# DBHVPAY_2.0 == 0 corresponds to the original "yes" answer, so the dummy for
# value 0 becomes phys_yes and the dummy for value 1 becomes phys_no
physical = pd.get_dummies(health_cl['DBHVPAY_2.0'])
physical.columns = ["phys_yes", "phys_no"]
physical = physical.drop(columns=['phys_no'])
physical.head()
#replace 'phys_yes' with 'DBHVPAY_YES'
# join_axes was removed in pandas 1.0; reindex explicitly instead
health_tree = pd.concat([physical, health_cl], axis=1).reindex(physical.index)
health_tree = health_tree.drop(columns=['DBHVPAY_2.0'])
# rename via the public API: assigning into columns.values mutates the index
# buffer in place and is not guaranteed to propagate in current pandas
health_tree = health_tree.rename(columns={"phys_yes": "DBHVPAY_YES"})
health_tree.head()
health_tree_drop = health_tree
#column names
health_names = health_tree_drop.columns.values
health_names
y = health_tree_drop['DBHVPAY_YES'] # y variable
X = health_tree_drop[health_names[1:]]
X.head()
y.head()  # was `y.head` -- referenced the method without calling it
#create train and test sets
# sklearn.cross_validation was removed in sklearn 0.20; use model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99874)
print("X_train", X_train.shape)
print("y_train", y_train.shape)
print("X_test", X_test.shape)
print("y_test", y_test.shape)
############################################################################################
#Decision Tree
############################################################################################
# entropy split criterion; only split nodes holding at least 3 samples
treeclf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=3)
treeclf = treeclf.fit(X_train, y_train)
#A versatile function to measure performance of a classification model
from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print accuracy, classification report and confusion matrix for clf on (X, y).

    Parameters
    ----------
    X, y : features and true labels to evaluate on.
    clf : fitted classifier exposing .predict.
    show_accuracy, show_classification_report, show_confussion_matrix :
        toggles for each report section (parameter spelling kept for
        backward compatibility with existing call sites).
    """
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
measure_performance(X_test, y_test, treeclf, show_confussion_matrix=True, show_classification_report=True)
#predictions
treepreds_test = treeclf.predict(X_test)
print(treepreds_test)
#classification report
print(classification_report(y_test, treepreds_test))
treecm = confusion_matrix(y_test, treepreds_test)
print(treecm)
print(treeclf.score(X_test, y_test))
print(treeclf.score(X_train, y_train))
export_graphviz(treeclf, out_file='tree.dot', feature_names=X_train.columns, class_names=["No","Yes"])
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
# `system(dot ...)` is not Python; shell out via os.system with the command as a string
os.system("dot -Tpng tree.dot -o dtree.png")
from IPython.display import Image
Image(filename='dtree.png', width=800)
########################################################################################
#Feature Selection
########################################################################################
#Select the top 30% of the most important features, using a chi2 test
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=30)
X_train_fs = fs.fit_transform(X_train, y_train)
np.set_printoptions(suppress=True, precision=2, linewidth=320)
print(fs.get_support())
print(fs.scores_)
X.columns.values
col_fs = X.columns[fs.get_support()].values
print(col_fs)
# list each selected feature with its chi2 score
# (support mask hoisted out of the loop instead of recomputing per iteration)
support = fs.get_support()
for i in range(len(X.columns.values)):
    if support[i]:
        print(X.columns.values[i], '\t', fs.scores_[i])
print(X_train_fs)
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False, show_classification_report=True)
# X_train_fs is a plain numpy array, so pass the selected column names explicitly
export_graphviz(dt, out_file='tree2.dot', feature_names=col_fs, class_names=["No","Yes"])
with open("tree2.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
# `system(...)` is not Python; use os.system with a command string
os.system("dot -Tpng tree2.dot -o dtree2.png")
from IPython.display import Image
Image(filename='dtree2.png', width=800)
#Find the best percentile for feature selection using cross-validation
# sklearn.cross_validation was removed in sklearn 0.20
from sklearn.model_selection import cross_val_score
dt = tree.DecisionTreeClassifier(criterion='entropy')
percentiles = range(1, 100, 5)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    print(i, scores.mean())
    results = np.append(results, scores.mean())
# argmax picks the first maximum; the original int(np.where(...)[0]) raised a
# TypeError whenever several percentiles tied for the best score
optimal_percentil = int(np.argmax(results))
print("Optimal percentile of features is: ", percentiles[optimal_percentil])
# integer number of features corresponding to the winning percentile
optimal_num_features = int(percentiles[optimal_percentil] * X.shape[1] / 100)
print("Optimal number of features is: ", optimal_num_features)
# Plot percentile of features VS. cross-validation scores
import pylab as pl
pl.figure()
pl.xlabel("Percentage of features selected")
pl.ylabel("Cross validation accuracy")
pl.plot(percentiles, results)
#Impact of max-depth
# sklearn.cross_validation was removed in sklearn 0.20; the modern KFold takes
# n_splits and yields train/test indices via .split()
from sklearn.model_selection import KFold

def calc_params(X, y, clf, param_values, param_name, K):
    """Sweep one hyper-parameter and plot train vs. K-fold CV accuracy.

    Parameters
    ----------
    X, y : training data (converted to numpy arrays internally).
    clf : classifier supporting set_params/fit/score.
    param_values : iterable of values to try for `param_name`.
    param_name : name of the hyper-parameter to vary.
    K : number of cross-validation folds.

    Returns
    -------
    (train_scores, test_scores) : mean fold accuracy for each parameter value.
    """
    # Convert input to Numpy arrays so fancy indexing by fold indices works
    X = np.array(X)
    y = np.array(y)
    # initialize training and testing scores with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))
    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print(param_name, ' = ', param_value)
        # set classifier parameters
        clf.set_params(**{param_name: param_value})
        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)
        # create KFold cross validation (new API: n_splits, then .split(X))
        cv = KFold(n_splits=K, shuffle=True, random_state=0)
        # iterate over the K folds
        for j, (train, test) in enumerate(cv.split(X)):
            # numpy fancy indexing replaces the per-row list comprehensions
            clf.fit(X[train], y[train])
            k_train_scores[j] = clf.score(X[train], y[train])
            k_test_scores[j] = clf.score(X[test], y[test])
        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)
    # plot the training and testing scores
    plt.plot(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.plot(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")
    # return the training and testing scores on each parameter value
    return train_scores, test_scores
# Evenly spaced integer candidate values for max_depth
md = np.linspace(1, 40, 20).astype(int)
print(md)
train_scores, test_scores = calc_params(X_train, y_train, dt, md, 'max_depth', 5)
#max_depth = 4 looks the best
#min number of samples allowed at a leaf node
msl = np.linspace(1, 30, 15).astype(int)
dt = tree.DecisionTreeClassifier(criterion='entropy')
train_scores, test_scores = calc_params(X_train, y_train, dt, msl, 'min_samples_leaf', 5)
#min_samples_leaf should be between 10 and 15
#Grid Search allows us to more systematically explore combinations of multiple parameters
from sklearn.model_selection import GridSearchCV
dt = tree.DecisionTreeClassifier()
parameters = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [1, 2, 3, 4, 5, 6, 8, 10],
    'min_samples_leaf': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
}
gs = GridSearchCV(estimator=dt, param_grid=parameters, verbose=1, cv=5, refit=True, n_jobs=-1)
# (%time is an IPython magic and is invalid in a plain .py file, so it was removed)
gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
# Note: best parameters results change most of the time
# [Parallel(n_jobs=1)]: Done 8000 out of 8000 | elapsed: 2.9min finished
# ({'criterion': 'gini',
#  'max_depth': 5,
#  'min_samples_leaf': 20,
#  'min_samples_split': 2},
#  0.8076197957580519)
# refit with the best parameters reported by the grid search
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=20, min_samples_split=2)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confussion_matrix=False, show_classification_report=True)
from sklearn.tree import export_graphviz
export_graphviz(dt, out_file='treefinal.dot', feature_names=X_train.columns)
import graphviz
with open("treefinal.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
# `system(...)` is not Python; use os.system with a command string
os.system("dot -Tpng treefinal.dot -o treefinal.png")
from IPython.display import Image
Image(filename='treefinal.png', width=800)
print(X_train.columns.values)
# alternative tree using the parameters suggested by the manual sweeps above
dt = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=10, min_samples_split=2)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confussion_matrix=True, show_classification_report=True)
dt
dt.get_params()  # was `dt.get_params` -- referenced the method without calling it
from sklearn.tree import export_graphviz
export_graphviz(dt, out_file='treefinal2.dot', feature_names=X_train.columns)
import graphviz
with open("treefinal2.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
os.system("dot -Tpng treefinal2.dot -o treefinal2.png")
from IPython.display import Image
Image(filename='treefinal2.png', width=800)
#variables returned by decision tree
#'DBHVCLY_2.0','BMI','DBHVPAN_2.0','DBHVWLY_2.0','CANEV_2.0','ASISTLV_4.0','AHCNOYR2','BEDDAYR',
#'DBHVCLN_2.0','AMDLONGR_1.0','FLA1AR_2'
#create new data frame
health_tree.tail()
health_tree = pd.DataFrame(health_tree)
health_tree.head(1)
# 'AHCNOYR2' was listed twice in the original selection, which silently
# duplicated that column in both hn and X; listed once here
tree_vars = ['DBHVCLY_2.0', 'BMI', 'DBHVPAN_2.0', 'DBHVWLY_2.0', 'CANEV_2.0', 'ASISTLV_4.0', 'AHCNOYR2', 'BEDDAYR', 'DBHVCLN_2.0', 'AMDLONGR_1.0', 'FLA1AR_2']
hn = health_tree[['DBHVPAY_YES'] + tree_vars]
hn.head()
y = hn['DBHVPAY_YES'] # y variable
X = hn[tree_vars]
X.head()
#create train and test sets
# sklearn.cross_validation was removed in sklearn 0.20; use model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9874)
print("X_train", X_train.shape)
print("y_train", y_train.shape)
print("X_test", X_test.shape)
print("y_test", y_test.shape)
##############################################################################################################
#Logistic Regression
##############################################################################################################
from sklearn.linear_model import LogisticRegression
#logistic regression with recursive feature elimination using cross validation
LogReg = LogisticRegression()
refcv = RFECV(LogReg, cv=10, step=1, n_jobs=-1)
fitcv = refcv.fit(X_train, y_train)
y_pred = fitcv.predict(X_test)
y_pred
from sklearn.metrics import confusion_matrix
# keep the result in its own name: assigning to `confusion_matrix` would
# shadow the sklearn function and break any later call to it
cm_rfecv = confusion_matrix(y_test, y_pred)
cm_rfecv
print(classification_report(y_test, y_pred))
#np.set_printoptions(threshold=np.inf)
np.set_printoptions(threshold=1000)
print("", y_pred)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
y_pred_proba = fitcv.predict_proba(X_test)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="auc=" + str(auc))
plt.legend(loc=4)
plt.show()
print("Area Under Curve AUC is ", auc)
fitcv.score(X_train, y_train)
fitcv.score(X_test, y_test)
# the % operand must be inside the print call; `print("...") % x` prints the
# bare template and then raises TypeError (None % int)
print("Num Features: %d" % fitcv.n_features_)
print("Selected Features: %s" % fitcv.support_)
lr_selected = fitcv.support_
lr_selected
lr_rank = fitcv.ranking_
print("Feature Ranking: %s" % fitcv.ranking_)
lr_cols = X_train.columns.values
lr_cols
#print each feature with its RFECV rank (1 = selected)
for i in range(len(lr_cols)):
    #if lr_selected[i]:
    print(lr_cols[i], '\t', lr_rank[i])
#############################################################################################################
#logistic regression with cross validation using recursive feature elimination
#############################################################################################################
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import RFE
X_train.tail()
LogRegCV = LogisticRegressionCV(cv=10, random_state=123231, multi_class='ovr')
# n_features_to_select is keyword-only in current sklearn releases
rfe = RFE(LogRegCV, n_features_to_select=5)
fit = rfe.fit(X_train, y_train)
y_pred_cv = fit.predict(X_test)
y_pred_cv
from sklearn.metrics import confusion_matrix
# avoid shadowing the sklearn confusion_matrix function with the result array
cm_rfe = confusion_matrix(y_test, y_pred_cv)
cm_rfe
lg = classification_report(y_test, y_pred_cv)
print(lg)
#print(classification_report(y_test, y_pred_cv))
import pylab as plt
# %matplotlib inline  -- IPython magic; invalid outside a notebook
plt.matshow(cm_rfe)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_cv))
print("Precision:", metrics.precision_score(y_test, y_pred_cv))
print("Recall:", metrics.recall_score(y_test, y_pred_cv))
y_pred_proba_cv = fit.predict_proba(X_test)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba_cv)
auc = metrics.roc_auc_score(y_test, y_pred_proba_cv)
plt.plot(fpr, tpr, label="auc=" + str(auc))
plt.legend(loc=4)
plt.show()
print("Area Under Curve AUC is ", auc)
fit.score(X_train, y_train)
fit.score(X_test, y_test)
# `print("...") % x` is a py2->py3 porting bug; format inside the call instead
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
lr_selected = fit.support_
lr_selected
lr_rank = fit.ranking_
print("Feature Ranking: %s" % fit.ranking_)
lr_cols = X_train.columns.values
lr_cols
#print each feature with its RFE rank (1 = selected)
for i in range(len(lr_cols)):
    #if lr_selected[i]:
    print(lr_cols[i], '\t', lr_rank[i])
# variables selected: DBHVCLY_2.0, DBHVPAN_2.0, DBHVWLY_2.0, AMDLONGR_1.0, FLA1AR_2
###############################################################################################
# Logistic Regression using the statsmodels api
#################################################################################################
import statsmodels.api as sm
hn.head()
# full model: regress the physical-activity target on every remaining predictor
ys = health_tree_drop['DBHVPAY_YES'] # y variable
Xs = health_tree_drop[health_names[1:]]
ys.tail()
Xs.dtypes
Xs.dtypes
logit_model=sm.Logit(ys,Xs)
result=logit_model.fit()
print(result.summary2())
# reduced model: hand-picked subset of predictors that were significant in the full model
cols=['SEX_2', 'MRACRPI2_4', 'WRKCATA_2.0', 'WRKCATA_5.0', 'HOURPDA_2.0', 'PDSICKA_2.0', 'HYPEV_2.0', 'HYBPLEV_5.0', 'CHLEV_2.0', 'CANEV_2.0', 'DBHVCLY_2.0', 'DBHVWLY_2.0', 'DBHVPAN_2.0', 'DBHVCLN_2.0', 'DBHVWLN_2.0', 'DIBREL_2.0', 'DIBPRE2_2.0', 'AHEARST1_2.0', 'AHEARST1_4.0', 'AHEARST1_6.0', 'VIMGLASS_2.0', 'AVISACT_2.0', 'FLA1AR_2', 'APLKIND_5.0', 'AMDLONGR_4.0', 'AMDLONGR_5.0', 'HIT1A_2.0', 'HIT4A_2.0', 'ASICPUSE_2.0', 'ASICPUSE_4.0', 'ASIMEDC_2.0', 'ASISTLV_4.0', 'AWEBUSE_2.0', 'YTQU_YG1_2.0', 'ASISLEEP', 'BMI', 'AHCNOYR2']
cols[0:5]
Xs2= health_tree_drop[cols]
Xs2[0:5]
ys2= ys
ys2.head()
# refit the logit on the reduced predictor set
logit_model2=sm.Logit(ys2,Xs2)
result2=logit_model2.fit()
print(result2.summary2())
#create train/test split for the reduced predictor set
X_train2, X_test2, y_train2, y_test2 = train_test_split(Xs2, ys2, test_size=0.2, random_state=2323)
LogReg3 = LogisticRegression()
LogReg3.fit(X_train2, y_train2)
#predict y
y_pred2 = LogReg3.predict(X_test2)
print('Accuracy : {:.2f}'.format(LogReg3.score(X_test2, y_test2)))
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix3 = confusion_matrix(y_test2, y_pred2)
print(confusion_matrix3)
lg2 = classification_report(y_test2, y_pred2)
print(lg2)
#ROC curve and AUC for the reduced model
y_pred_proba2 = LogReg3.predict_proba(X_test2)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test2, y_pred_proba2)
auc = metrics.roc_auc_score(y_test2, y_pred_proba2)
plt.plot(fpr, tpr, label="auc=" + str(auc))
plt.legend(loc=4)
plt.show()
print("Area Under Curve AUC is ", auc)
##############################################################################################################
#LDA
##############################################################################################################
ldclf = LinearDiscriminantAnalysis()
ldclf = ldclf.fit(X_train, y_train)
print("Score on Training: ", ldclf.score(X_train, y_train))
print("Score on Test: ", ldclf.score(X_test, y_test))
# 5-fold cross validation (the original comment said 10-fold, but cv=5)
# sklearn.cross_validation was removed in sklearn 0.20
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(ldclf, X, y, cv=5)
cv_scores
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
##############################################################################################################
#Clustering numerical variables
##############################################################################################################
health_cl = health_tree
print(health_cl.shape)
health_cl.tail()
health_cl_num = health_cl[['YRSWRKPA','ASISLEEP','AHEIGHT','BMI','BEDDAYR','CLCKTP','AHCNOYR2','LOCALL1B']]
health_cl_num.tail()
#min max normalization of numerical variables
x = health_cl_num.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
health_cl_num_norm = pd.DataFrame(x_scaled, columns=health_cl_num.columns)
health_cl_num_norm.columns = ['YRSWRKPA_NORM','ASISLEEP_NORM','AHEIGHT_NORM','BMI_NORM','BEDDAYR_NORM','CLCKTP_NORM','AHCNOYR2_NORM','LOCALL1B_NORM']
health_cl_num_norm.head()
health_cl_num_norm.dtypes
#Xcl = np.array(health_cl2.drop(['DBHVPAY_YES'], 1).astype(float64))
#Xcl = np.array(health_cl2.drop(['DBHVPAY_YES'], 1))
Xcl = np.array(health_cl_num_norm)
Xcl[0:5]
#categorical dataset (the dummy columns preceding the numeric block)
health_cl1 = health_cl.iloc[:, 0:135]
health_cl1.tail()
#merge -- join_axes was removed in pandas 1.0; reindex explicitly instead
health_cl2 = pd.concat([health_cl1, health_cl_num_norm], axis=1).reindex(health_cl1.index)
health_cl2.tail()
ycl = np.array(health_cl2['DBHVPAY_YES'])
ycl[0:5]
# single KMeans fit (the original fit the model twice with the first result discarded)
kmeans = KMeans(n_clusters=5, max_iter=500, verbose=1)
kmeans.fit(Xcl)
clusters = kmeans.predict(Xcl)
print(clusters)
clusters.shape
ycl.shape
#compare clusters against the binary target
print(completeness_score(ycl, clusters))
print(homogeneity_score(ycl, clusters))
#this clustering model cannot be used; it was an experiment clustering numerical data against a binary y variable.